//------------------------------------------------------------------------------
// GB_builder: build a matrix from tuples
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2025, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// CALLED BY: GB_build, GB_wait, GB_transpose, GB_concat_hyper

// This function is called by GB_build to build a matrix T for GrB_Matrix_build
// or GrB_Vector_build, by GB_wait to build a matrix T from the list of pending
// tuples, and by GB_transpose to transpose a matrix or vector.  Duplicates can
// appear if called by GB_build or GB_wait, but not GB_transpose.

// The indices are provided either as (I_input,J_input) or (I_work,J_work), not
// both.  The values are provided as S_input or S_work, not both.  On return,
// the *work arrays are either transplanted into T, or freed, since they are
// temporary workspaces.

// The work is done in major 5 Steps, some of which can be skipped, depending
// on how the tuples are provided (*_work or *_input), and whether or not they
// are sorted, or have duplicates.  If vdim <= 1, some work is skipped (for
// GrB_Vectors, and single-vector GrB_Matrices).  Let e be the of tuples on
// input.  Let p be the # of threads used.

// STEP 1: copy user input.  O(e/p) read/write per thread, or skipped.

// STEP 2: sort the tuples.  Time: O((e log e)/p), read/write, or skipped if
//         the tuples are already sorted.

// STEP 3: count vectors and duplicates.  O(e/p) reads, per thread, if no
//         duplicates, or skipped if already done.  O(e/p) read/writes
//         per thread if duplicates appear.

// STEP 4: construct T->h and T->p.  O(e/p) reads per thread, or skipped if
//         T is a vector.

// STEP 5: assemble the tuples.  O(e/p) read/writes per thread, or O(1) if the
//         values can be transplanted into T as-is.

// For GrB_Matrix_build:  If the input (I_input, J_input, S_input) is already
// sorted with no duplicates, and no typecasting needs to be done, then Step 1
// still must be done (each thread does O(e/p) reads of (I_input,J_input) and
// writes to I_work), but Step 1 also does the work for Step 3.  Step 2 and 3
// are skipped.  Step 4 does O(e/p) reads per thread (J_input only).  Then
// I_work is transplanted into T->i.  Step 5 does O(e/p) read/writes per thread
// to copy Sx into T->x.

// For GrB_Vector_build: as GrB_Matrix_build, Step 1 does O(e/p) read/writes
// per thread.  The input is always a vector, so vdim == 1 always holds.  Step
// 2 is skipped if the indices are already sorted, and Step 3 does no work at
// all unless duplicates appear.  Step 4 takes no time, for any vector. Step 5
// does O(e/p) reads/writes per thread.

// For GB_wait:  the pending tuples are provided as I_work, J_work, and S_work,
// so Step 1 is skipped (no need to check for invalid indices).  The input
// J_work may be null (vdim can be anything, since GB_wait is used for both
// vectors and matrices).  The tuples might be in sorted order already, which
// is known precisely known from A->Pending->sorted.  Step 2 does
// O((e log e)/p) work to sort the tuples.  Duplicates may appear, and
// out-of-order tuples are likely.  Step 3 does O(e/p) read/writes.  Step 4
// does O(e/p) reads per thread of (I_work,J_work), or just I_work.  Step 5
// does O(e/p) read/writes per thread, or O(1) time if S_work can be
// transplanted into T->x.

// For GB_transpose: uses I_work, J_work, and either S_input (if no op applied
// to the values) or S_work (if an op was applied to the A->x values).  This is
// only done for matrices, not vectors, so vdim > 1 will always hold.  The
// indices are valid so Step 1 is skipped.  The tuples are not sorted, so Step
// 2 takes O((e log e)/p) time to do the sort.  There are no duplicates, so
// Step 3 only does O(e/p) reads of J_work to count the vectors in each slice.
// Step 4 only does O(e/p) reads of J_work to compute T->h and T->p.  Step 5
// does O(e/p) read/writes per thread, but it uses the simpler case in
// GB_bld_template since no duplicates can appear.  It is unlikely
// able to transplant S_work into T->x since the input will almost always be
// unsorted.

// For GB_concat_hyper:  uses I_work, J_work, and S_work.  No duplicates
// appear.  Tuples are not sorted on input.  I_work is transplanted into C->i.
// J_work and S_work are freed on output.  S_work is not transplanted into
// C->x.

// For iso inputs/outputs: T and Sx have the same iso property.  If they are
// iso, then dup is always NULL.  Duplicates may or may not appear if T and Sx
// are iso.

//  (1) GrB_Matrix_build, GrB_Vector_build, and GB_wait do not pass in an iso
//      Sx array, where Sx is S_input for GrB*build, and S_work for GB_wait.
//      Sx and Tx are not iso.  Duplicates may appear.  dup is always present
//      for GrB*build, but may be either NULL or non-NULL for GB_wait.

//  (2) GxB_Matrix_build_Scalar and GxB_Vector_build_Scalar: always construct
//      iso matrices.  For those methods Sx and Tx are always iso, and no dup
//      operator is be passed in (dup is NULL here, which is the implied 2nd
//      operator).  Duplicates may appear.

//  (3) GB_transpose and GB_concat_hyper can pass in Sx as iso or non-iso, and
//      always passes in dup as NULL since there are no duplicates.  Sx and Tx
//      are either both iso, or both non-iso.

// This method always returns T as hypersparse, and T is iso if and only if Sx
// is iso.

// FUTURE: add a descriptor that allows input I,J to be 1-based

#include "builder/GB_build.h"
#include "sort/GB_sort.h"
#include "binaryop/GB_binop.h"
#include "jitifyer/GB_stringify.h"
#ifndef GBCOMPACT
#include "GB_control.h"
#include "FactoryKernels/GB_bld__include.h"
#endif

#define GB_I_WORK(t) (((t) < 0) ? -1 : GB_IGET (I_work, t))
#define GB_J_WORK(t) (((t) < 0) ? -1 : (J_work ? GB_IGET (J_work, t) :  0  ))
#define GB_K_WORK(t) (((t) < 0) ? -1 : (K_work ? GB_IGET (K_work, t) : (t) ))

#define GB_FREE_WORKSPACE                           \
{                                                   \
    GB_WERK_POP (Work, int64_t) ;                   \
    GB_FREE_MEMORY (I_work_handle, *I_work_size_handle) ;  \
    GB_FREE_MEMORY (J_work_handle, *J_work_size_handle) ;  \
    GB_FREE_MEMORY (S_work_handle, *S_work_size_handle) ;  \
    GB_FREE_MEMORY (&K_work, K_work_size) ;           \
}

#define GB_FREE_ALL GB_FREE_WORKSPACE

//------------------------------------------------------------------------------
// GB_builder
//------------------------------------------------------------------------------

GrB_Info GB_builder                 // build a matrix from tuples
(
    GrB_Matrix T,                   // matrix to build, static or dynamic header
    const GrB_Type ttype,           // type of output matrix T
    const int64_t vlen,             // length of each vector of T
    const int64_t vdim,             // number of vectors in T
    const bool is_csc,              // true if T is CSC, false if CSR
    void **I_work_handle,           // for (i,k) or (j,i,k) tuples
    size_t *I_work_size_handle,
    void **J_work_handle,           // for (j,i,k) tuples
    size_t *J_work_size_handle,
    GB_void **S_work_handle,        // array of values of tuples, size ijslen,
                                    // or size 1 if S is iso
    size_t *S_work_size_handle,
    bool known_sorted,              // true if tuples known to be sorted
    bool known_no_duplicates,       // true if tuples known to not have dupl
    int64_t ijslen,                 // size of I_work and J_work arrays
    const bool is_matrix,           // true if T a GrB_Matrix, false if vector
    const void *restrict I_input,   // original indices, size nvals
    const void *restrict J_input,   // original indices, size nvals
    const GB_void *restrict S_input,// array of values of tuples, size nvals,
                                    // or size 1 if S_input or S_work are iso
    const bool S_iso,               // true if S_input or S_work are iso
    const int64_t nvals,            // number of tuples, and size of K_work
    GrB_BinaryOp dup,               // binary function to assemble duplicates,
                                    // if NULL use the SECOND operator to
                                    // keep the most recent duplicate.
    const GrB_Type stype,           // the type of S_work or S_input
    bool do_burble,                 // if true, then burble is allowed
    GB_Werk Werk,
    bool I_is_32,       // true if I (I_work or I_input) is 32 bit, false if 64
    bool J_is_32,       // true if J (J_work or J_input) is 32 bit, false if 64
    bool Tp_is_32,      // true if T->p is built as 32 bit, false if 64
    bool Tj_is_32,      // true if T->h is built as 32 bit, false if 64
    bool Ti_is_32       // true if T->i is built as 32 bit, false if 64
)
{

    //--------------------------------------------------------------------------
    // check inputs
    //--------------------------------------------------------------------------

    GrB_Info info ;
    ASSERT (T != NULL) ;            // T is a static or dynamic header on input 
    ASSERT (nvals >= 0) ;
    ASSERT_TYPE_OK (ttype, "ttype for builder", GB0) ;
    ASSERT_BINARYOP_OK_OR_NULL (dup, "dup for builder", GB0) ;
    ASSERT (I_work_handle != NULL) ;
    ASSERT (J_work_handle != NULL) ;
    ASSERT (S_work_handle != NULL) ;
    ASSERT (!GB_OP_IS_POSITIONAL (dup)) ;
    ASSERT (I_work_size_handle != NULL) ;
    ASSERT (J_work_size_handle != NULL) ;
    ASSERT (S_work_size_handle != NULL) ;

    //--------------------------------------------------------------------------
    // get Sx
    //--------------------------------------------------------------------------

    GB_void *restrict S_work = (*S_work_handle) ;
    const GB_void *restrict Sx = (S_work == NULL) ? S_input : S_work ;
    ASSERT (GB_IMPLIES (nvals > 0, Sx != NULL)) ;
    ASSERT (GB_IMPLIES (S_iso, ttype == stype)) ;
    ASSERT (GB_IMPLIES (S_iso, dup == NULL)) ;

    //==========================================================================
    // symbolic phase of the build =============================================
    //==========================================================================

    // The symbolic phase sorts the tuples and finds any duplicates.  The
    // output matrix T is constructed (not including T->i and T->x), and T->h
    // and T->p are computed.  Then I_work is transplanted into T->i, or T->i is
    // allocated.  T->x is then allocated.  It is not computed until the
    // numeric phase.

    // When this function returns, I_work is either freed or transplanted into
    // T->i.  J_work is freed, and the I_work and J_work pointers (in the
    // caller) are set to NULL by setting their handles to NULL.  Note that
    // J_work may already be NULL on input, if T has one or zero vectors
    // (J_work_handle is always non-NULL however).

    void *restrict I_work = (*I_work_handle) ;
    void *restrict J_work = (*J_work_handle) ;

    GB_IDECL (I_work,       , u) ; GB_IPTR (I_work , I_is_32) ;
    GB_IDECL (I_input, const, u) ; GB_IPTR (I_input, I_is_32) ;
    GB_IDECL (J_work,       , u) ; GB_IPTR (J_work , J_is_32) ;
    GB_IDECL (J_input, const, u) ; GB_IPTR (J_input, J_is_32) ;

    // K_is_32 does not use GB_determine_i_is_32 since it is not transplanted
    // into the output matrix T.
    bool K_is_32 = (nvals < UINT32_MAX) ;
    GB_MDECL (K_work, , u) ; size_t K_work_size = 0 ;

    Tj_is_32 = GB_determine_j_is_32 (Tj_is_32, vdim) ;     // OK
    Ti_is_32 = GB_determine_i_is_32 (Ti_is_32, vlen) ;     // OK

    // duplicate indices are flagged using an out-of-range index, after
    // any out-of-range indices on input have been checked.
    int64_t duplicate_entry = vlen ;

    //--------------------------------------------------------------------------
    // determine the number of threads to use
    //--------------------------------------------------------------------------

    int nthreads_max = GB_Context_nthreads_max ( ) ;
    double chunk = GB_Context_chunk ( ) ;
    int nthreads = GB_nthreads (nvals, chunk, nthreads_max) ;

    //--------------------------------------------------------------------------
    // allocate workspace
    //--------------------------------------------------------------------------

    GB_WERK_DECLARE (Work, int64_t) ;
    GB_WERK_PUSH (Work, 5*(nthreads+1), int64_t) ;
    if (Work == NULL)
    { 
        // out of memory
        GB_FREE_WORKSPACE ;
        return (GrB_OUT_OF_MEMORY) ;
    }

    memset (Work, 0, Work_nitems * sizeof (int64_t)) ;
    int64_t *restrict tstart_slice = Work ;                  // nthreads+1
    int64_t *restrict tnvec_slice  = Work +   (nthreads+1) ; // nthreads+1
    int64_t *restrict tnz_slice    = Work + 2*(nthreads+1) ; // nthreads+1
    int64_t *restrict kbad         = Work + 3*(nthreads+1) ; // nthreads
    int64_t *restrict ilast_slice  = Work + 4*(nthreads+1) ; // nthreads

    //--------------------------------------------------------------------------
    // partition the tuples for the threads
    //--------------------------------------------------------------------------

    // Thread tid handles tuples tstart_slice [tid] to tstart_slice [tid+1]-1.
    // Each thread handles about the same number of tuples.  This partition
    // depends only on nvals.

    GB_e_slice (tstart_slice, nvals, nthreads) ;

    // tstart_slice [tid]: first tuple in slice tid
    // tnvec_slice [tid]: # of vectors that start in a slice.  If a vector
    //                    starts in one slice and ends in another, it is
    //                    counted as being in the first slice.
    // tnz_slice   [tid]: # of entries in a slice after removing duplicates

    // sentinel values for the final cumulative sum
    tnvec_slice [nthreads] = 0 ;
    tnz_slice   [nthreads] = 0 ;

    // this becomes true if the first pass computes tnvec_slice and tnz_slice,
    // and if the (I_input,J_input) tuples were found to be already sorted with
    // no duplicates present.
    bool tnvec_and_tnz_slice_computed = false ;

    //--------------------------------------------------------------------------
    // STEP 1: copy user input and check if valid
    //--------------------------------------------------------------------------

    // If the indices are provided by (I_input,J_input), then import them into
    // (I_work,J_work) and check if they are valid, and sorted.   If the input
    // happens to be already sorted, then duplicates are detected and the # of
    // vectors in each slice is counted.

    if (I_work == NULL)
    {

        //----------------------------------------------------------------------
        // allocate I_work
        //----------------------------------------------------------------------

        // allocate workspace to load and sort the index tuples:

        // vdim <= 1: I_work and K_work for (i,k) tuples, where i = I_input [k]

        // vdim > 1: also J_work for (j,i,k) tuples where i = I_input [k] and
        // j = J_input [k].  If the tuples are found to be already sorted on
        // input, then J_work is not allocated, and J_input is used instead.

        // The k value in the tuple gives the position in the original set of
        // tuples: I_input [k] and Sx [k] when vdim <= 1, and also J_input [k]
        // for matrices with vdim > 1.

        // The workspace I_work is allocated here but freed (or transplanted)
        // inside GB_builder.  J_work and K_work are allocated, used, and freed
        // in GB_builder.

        ASSERT (J_work == NULL) ;

        // Revise I_is_32 for I_work.  This has no effect on I_input32 and
        // I_input64 since they are already assigned with the original value of
        // I_is_32.  There are four options for choosing I_is_32.

        // (1) This option uses GB_determine_i_is_32 since I_work can be
        // transplanted into T->i.  If 32-bit integers are used when
        // max(vlen,vdim) is > GB_NMAX32, then I_work must be cast to T->i when
        // transplanted later.  This option avoids the cast.  However, the sort
        // (if needed) would be a bit slower, and the transplant can only
        // happen if there are no duplicates.  If there are duplicates, T->i is
        // computed out-of-place from I_work, so there is no cast.
        // I_is_32 = GB_determine_i_is_32 (true, vlen) ;   // OK

        // (2) This option allows the widest use of 32-bit indices for I_work,
        // which speeds up the sort when max(vlen,vdim) > GB_NMAX32.  However,
        // the sort may be skipped, and if there are no duplicates, I_work is
        // transplanted into T->i.  In that case, this is slightly slower than
        // option (1), but it does use the least amount of memory of all 4
        // options when duplicates appear.
        // I_is_32 = vlen < UINT32_MAX ;

        // (3) This option ensures I_work can be transplanted into T->i with no
        // cast when no duplicates appear.  The caller can request 64-bit
        // indices for T->i (with Ti_is_32 = false on input) even for small
        // matrices.  However, this option makes the sort (if needed) slower.
        // It uses the least amount of memory when no duplicates appear, since
        // I_work can then always be transplanted into T->i with no extra time
        // and memory.
        I_is_32 = Ti_is_32 ;

        // (4) Do nothing, letting I_is_32 be determined by the integer size
        // of the I_input array.

        I_work = GB_MALLOC_MEMORY (nvals,
            I_is_32 ? sizeof (uint32_t) : sizeof (uint64_t),
            I_work_size_handle) ;

        (*I_work_handle) = I_work ;
        ijslen = nvals ;
        if (I_work == NULL)
        { 
            // out of memory
            GB_FREE_WORKSPACE ;
            return (GrB_OUT_OF_MEMORY) ;
        }

        GB_IPTR (I_work, I_is_32) ;

        //----------------------------------------------------------------------
        // create the tuples to sort, and check for any invalid indices
        //----------------------------------------------------------------------

        known_sorted = true ;
        bool no_duplicates_found = true ;

        if (nvals == 0)
        { 

            // nothing to do

        }
        else if (is_matrix)
        {

            //------------------------------------------------------------------
            // C is a matrix; check both I_input and J_input
            //------------------------------------------------------------------

            ASSERT (J_input != NULL) ;
            ASSERT (I_work != NULL) ;
            ASSERT (vdim >= 0) ;
            ASSERT (I_input != NULL) ;

            int tid ;
            #pragma omp parallel for num_threads(nthreads) schedule(static) \
                reduction(&&:known_sorted) reduction(&&:no_duplicates_found)
            for (tid = 0 ; tid < nthreads ; tid++)
            {
                kbad [tid] = -1 ;
                int64_t my_tnvec = 0 ;
                int64_t kstart = tstart_slice [tid] ;
                int64_t kend   = tstart_slice [tid+1] ;
                int64_t ilast  = (kstart == 0) ? (-1) :
                                 GB_IGET (I_input, kstart-1) ;
                int64_t jlast  = (kstart == 0) ? (-1) :
                                 GB_IGET (J_input, kstart-1) ;

                for (int64_t k = kstart ; k < kend ; k++)
                {
                    // get k-th index from user input: (i,j)
                    int64_t i = GB_IGET (I_input, k) ;
                    int64_t j = GB_IGET (J_input, k) ;

                    if (i < 0 || i >= vlen || j < 0 || j >= vdim)
                    { 
                        // halt if out of bounds
                        kbad [tid] = k ;
                        break ;
                    }

                    // check if the tuples are already sorted
                    known_sorted = known_sorted &&
                        ((jlast < j) || (jlast == j && ilast <= i)) ;

                    // check if this entry is a duplicate of the one before it
                    no_duplicates_found = no_duplicates_found &&
                        (!(jlast == j && ilast == i)) ;

                    // copy the tuple into I_work.  J_work is done later.
                    // Iwork [k] = i
                    GB_ISET (I_work, k, i) ;

                    if (j > jlast)
                    { 
                        // vector j starts in this slice (but this is
                        // valid only if J_input is sorted on input)
                        my_tnvec++ ;
                    }

                    // log the last index seen
                    ilast = i ; jlast = j ;
                }

                // these are valid only if I_input and J_input are sorted on
                // input, with no duplicates present.
                tnvec_slice [tid] = my_tnvec ;
                tnz_slice   [tid] = kend - kstart ;
            }

            // collect the report from each thread
            for (int tid = 0 ; tid < nthreads ; tid++)
            {
                if (kbad [tid] >= 0)
                { 
                    // invalid index
                    uint64_t i = GB_IGET (I_input, kbad [tid]) ;
                    uint64_t j = GB_IGET (J_input, kbad [tid]) ;
                    uint64_t row = is_csc ? i : j ;
                    uint64_t col = is_csc ? j : i ;
                    uint64_t nrows = is_csc ? vlen : vdim ;
                    uint64_t ncols = is_csc ? vdim : vlen ;
                    GB_FREE_WORKSPACE ;
                    GB_ERROR (GrB_INDEX_OUT_OF_BOUNDS,
                        "index (" GBu "," GBu ") out of bounds,"
                        " must be < (" GBu ", " GBu ")",
                        row, col, nrows, ncols) ;
                }
            }

            // if the tuples were found to be already in sorted order, and if
            // no duplicates were found, then tnvec_slice and tnz_slice are now
            // valid, Otherwise, they can only be computed after sorting.
            tnvec_and_tnz_slice_computed = known_sorted && no_duplicates_found ;

            //------------------------------------------------------------------
            // allocate J_work, if needed
            //------------------------------------------------------------------

            if (vdim > 1 && !known_sorted)
            { 
                // copy J_input into J_work, so the tuples can be sorted.
                // J_work can be smaller than J_input, so GB_cast_int is used.
                // This does not need to use GB_determine_i_is_32, since J_work
                // is not transplanted into the output matrix T.
                bool J_is_32_new = (vdim < UINT32_MAX) ;
                J_work = GB_MALLOC_MEMORY (nvals,
                    J_is_32_new ? sizeof (uint32_t) : sizeof (uint64_t),
                    J_work_size_handle) ;
                (*J_work_handle) = J_work ;
                if (J_work == NULL)
                { 
                    // out of memory
                    GB_FREE_WORKSPACE ;
                    return (GrB_OUT_OF_MEMORY) ;
                }
                // J_work = cast (J_input)
                if (do_burble)
                {
                    GBURBLE ("(cast J %d %d) ", J_is_32_new, J_is_32) ;
                }
                GB_cast_int (
                    J_work, J_is_32_new ? GB_UINT32_code : GB_UINT64_code,
                    J_input,    J_is_32 ? GB_UINT32_code : GB_UINT64_code,
                    nvals, nthreads) ;
                J_is_32 = J_is_32_new ;
            }
            else
            { 
                // J_work is a shallow copy of J_input.  The pointer is not
                // copied into (*J_work_handle), so it will not be freed.
                // J_input is not modified since J_work is not modified in
                // this case.  J_is_32 is unchanged.
                J_work = (void *) J_input ;
            }

            GB_IPTR (J_work, J_is_32) ;

        }
        else
        {

            //------------------------------------------------------------------
            // C is a typecasted GrB_Vector; check only I_input
            //------------------------------------------------------------------

            ASSERT (I_input != NULL) ;
            ASSERT (J_input == NULL) ;
            ASSERT (vdim == 1) ;

            int tid ;
            #pragma omp parallel for num_threads(nthreads) schedule(static) \
                reduction(&&:known_sorted) reduction(&&:no_duplicates_found)
            for (tid = 0 ; tid < nthreads ; tid++)
            {
                kbad [tid] = -1 ;
                int64_t kstart = tstart_slice [tid] ;
                int64_t kend  = tstart_slice [tid+1] ;
                int64_t ilast = GB_I_WORK (kstart - 1) ;

                for (int64_t k = kstart ; k < kend ; k++)
                {
                    // get k-th index from user input: (i)
                    int64_t i = GB_IGET (I_input, k) ;

                    if (i < 0 || i >= vlen)
                    { 
                        // halt if out of bounds
                        kbad [tid] = k ;
                        break ;
                    }

                    // check if the tuples are already sorted
                    known_sorted = known_sorted && (ilast <= i) ;

                    // check if this entry is a duplicate of the one before it
                    no_duplicates_found = no_duplicates_found &&
                        (!(ilast == i)) ;

                    // copy the tuple into the work arrays to be sorted
                    // I_work [k] = i ;
                    GB_ISET (I_work, k, i) ;

                    // log the last index seen
                    ilast = i ;
                }
            }

            // collect the report from each thread
            for (int tid = 0 ; tid < nthreads ; tid++)
            {
                if (kbad [tid] >= 0)
                { 
                    // invalid index
                    uint64_t i = GB_IGET (I_input, kbad [tid]) ;
                    GB_FREE_WORKSPACE ;
                    GB_ERROR (GrB_INDEX_OUT_OF_BOUNDS,
                        "index (" GBu ") out of bounds, must be < (" GBd ")",
                        i, vlen) ;
                }
            }
        }

        //----------------------------------------------------------------------
        // determine if duplicates are possible
        //----------------------------------------------------------------------

        // The input is now known to be sorted, or not.  If it is sorted, and
        // if no duplicates were found, then it is known to have no duplicates.
        // Otherwise, duplicates might appear, but a sort is required first to
        // check for duplicates.

        known_no_duplicates = known_sorted && no_duplicates_found ;
    }

    // I_input and J_input are now verified and have been copied into I_work
    // and J_work.  They are no longer used below.

    //--------------------------------------------------------------------------
    // STEP 2: sort the tuples in ascending order
    //--------------------------------------------------------------------------

    // If the tuples are known to already be sorted, Step 2 is skipped.  In
    // that case, K_work is NULL (not allocated), which implicitly means that
    // K_work [k] = k for all k = 0:nvals-1.  K_work is always NULL if Sx and
    // Tx are iso.

    if (!known_sorted)
    {

        //----------------------------------------------------------------------
        // allocate K_work workspace (not needed if T and Sx are iso)
        //----------------------------------------------------------------------

        if (!S_iso)
        {
            // create the k part of each tuple
            K_work = GB_MALLOC_MEMORY (nvals,
                K_is_32 ? sizeof (uint32_t) : sizeof (uint64_t),
                &K_work_size) ;
            if (K_work == NULL)
            { 
                // out of memory
                GB_FREE_WORKSPACE ;
                return (GrB_OUT_OF_MEMORY) ;
            }

            GB_IPTR (K_work, K_is_32) ;

            // The k part of each tuple (i,k) or (j,i,k) records the original
            // position of the tuple in the input list.  This allows an
            // unstable sorting algorithm to be used.  Since k is unique, it
            // forces the result of the sort to be stable regardless of whether
            // or not the sorting algorithm is stable.  It also keeps track of
            // where the numerical value of the tuple can be found; it is in
            // Sx[k] for the tuple (i,k) or (j,i,k), regardless of where the
            // tuple appears in the list after it is sorted.
            int64_t k ;
            #pragma omp parallel for num_threads(nthreads) schedule(static)
            for (k = 0 ; k < nvals ; k++)
            { 
                GB_ISET (K_work, k, k) ;
            }
        }

        //----------------------------------------------------------------------
        // sort all the tuples
        //----------------------------------------------------------------------

        if (vdim > 1)
        {

            //------------------------------------------------------------------
            // sort a set of (j,i,k) tuples
            //------------------------------------------------------------------

            if (S_iso)
            { 
                // K_work is NULL; only sort (j,i)
                GB_OK (GB_msort_2 (J_work, J_is_32, I_work, I_is_32, nvals,
                    nthreads)) ;
            }
            else
            { 
                GB_OK (GB_msort_3 (J_work, J_is_32, I_work, I_is_32,
                    K_work, K_is_32, nvals, nthreads)) ;
            }

        }
        else
        {

            //------------------------------------------------------------------
            // sort a set of (i,k) tuples
            //------------------------------------------------------------------

            if (S_iso)
            { 
                // K_work is NULL; only sort (i)
                GB_OK (GB_msort_1 (I_work, I_is_32, nvals, nthreads)) ;
            }
            else
            { 
                GB_OK (GB_msort_2 (I_work, I_is_32, K_work, K_is_32, nvals,
                    nthreads)) ;
            }
        }
    }

    //--------------------------------------------------------------------------
    // STEP 2 finished: the tuples are now sorted
    //--------------------------------------------------------------------------

    #ifdef GB_DEBUG
    if (vdim > 1)
    {
        int64_t ilast = -1 ;
        int64_t jlast = -1 ;
        for (int64_t k = 0 ; k < nvals ; k++)
        {
            int64_t i = GB_IGET (I_work, k) ;
            int64_t j = GB_IGET (J_work, k) ;
            ASSERT ((jlast < j) || (jlast == j && ilast <= i)) ;
            ilast = i ;
            jlast = j ;
        }
    }
    else
    {
        int64_t ilast = -1 ;
        for (int64_t k = 0 ; k < nvals ; k++)
        {
            int64_t i = GB_IGET (I_work, k) ;
            ASSERT (ilast <= i) ;
            ilast = i ;
        }
    }
    #endif

    //--------------------------------------------------------------------------
    // STEP 3: count vectors and duplicates in each slice
    //--------------------------------------------------------------------------

    // Duplicates are located, counted and their indices negated.  The # of
    // vectors in each slice is counted.  If the indices are known to not have
    // duplicates, then only the vectors are counted.  Counting the # of
    // vectors is skipped if already done by Step 1.

    if (known_no_duplicates)
    {

        //----------------------------------------------------------------------
        // no duplicates: just count # vectors in each slice
        //----------------------------------------------------------------------

        // This is much faster, particularly if the # of vectors in each slice
        // has already been computed.

        #ifdef GB_DEBUG
        {
            // assert that there are no duplicates
            int64_t ilast = -1, jlast = -1 ;
            for (int64_t t = 0 ; t < nvals ; t++)
            {
                int64_t i = GB_I_WORK (t), j = GB_J_WORK (t) ;
                bool is_duplicate = (i == ilast && j == jlast) ;
                ASSERT (!is_duplicate) ;
                ilast = i ; jlast = j ;
            }
        }
        #endif

        if (vdim <= 1)
        {

            // all tuples appear in at most one vector, and there are no
            // duplicates, so there is no need to scan I_work or J_work.

            for (int tid = 0 ; tid < nthreads ; tid++)
            { 
                int64_t tstart = tstart_slice [tid] ;
                int64_t tend   = tstart_slice [tid+1] ;
                tnvec_slice [tid] = 0 ;
                tnz_slice   [tid] = tend - tstart ;
            }
            tnvec_slice [0] = (nvals == 0) ? 0 : 1 ;

        }
        else
        {

            // count the # of unique vector indices in J_work.  No need to scan
            // I_work since there are no duplicates to be found.  Also no need
            // to compute them if already found in Step 1. 

            if (!tnvec_and_tnz_slice_computed)
            {

                int tid ;
                #pragma omp parallel for num_threads(nthreads) schedule(static)
                for (tid = 0 ; tid < nthreads ; tid++)
                {
                    int64_t my_tnvec = 0 ;
                    int64_t tstart = tstart_slice [tid] ;
                    int64_t tend   = tstart_slice [tid+1] ;
                    int64_t jlast  = GB_J_WORK (tstart-1) ;
                    for (int64_t t = tstart ; t < tend ; t++)
                    {
                        // get the t-th tuple
                        int64_t j = GB_IGET (J_work, t) ;
                        if (j > jlast)
                        { 
                            // vector j starts in this slice
                            my_tnvec++ ;
                            jlast = j ;
                        }
                    }
                    tnvec_slice [tid] = my_tnvec ;
                    tnz_slice   [tid] = tend - tstart ;
                }
            }
        }

    }
    else
    {

        //----------------------------------------------------------------------
        // look for duplicates and count # vectors in each slice
        //----------------------------------------------------------------------

        for (int tid = 0 ; tid < nthreads ; tid++)
        { 
            int64_t tstart = tstart_slice [tid] ;
            ilast_slice [tid] = GB_I_WORK (tstart-1) ;
        }

        int tid ;
        #pragma omp parallel for num_threads(nthreads) schedule(static)
        for (tid = 0 ; tid < nthreads ; tid++)
        {
            int64_t my_tnvec = 0 ;
            int64_t my_ndupl = 0 ;
            int64_t tstart = tstart_slice [tid] ;
            int64_t tend   = tstart_slice [tid+1] ;
            int64_t ilast  = ilast_slice [tid] ;
            int64_t jlast  = GB_J_WORK (tstart-1) ;

            for (int64_t t = tstart ; t < tend ; t++)
            {
                // get the t-th tuple
                int64_t i = GB_IGET (I_work, t) ;
                int64_t j = GB_J_WORK (t) ;

                // tuples are now sorted but there may be duplicates
                ASSERT ((jlast < j) || (jlast == j && ilast <= i)) ;

                // check if (j,i,k) is a duplicate
                if (i == ilast && j == jlast)
                { 
                    // flag the tuple as a duplicate
                    // I_work [t] = duplicate_entry
                    GB_ISET (I_work, t, duplicate_entry) ;
                    my_ndupl++ ;
                    // the sort places earlier duplicate tuples (with smaller
                    // k) after later ones (with larger k).
                    #ifdef GB_DEBUG
                    int64_t kprev = GB_K_WORK (t-1) ;
                    int64_t kthis = GB_K_WORK (t) ;
                    ASSERT (kprev < kthis) ;
                    #endif
                }
                else
                {
                    // this is a new tuple
                    if (j > jlast)
                    { 
                        // vector j starts in this slice
                        my_tnvec++ ;
                        jlast = j ;
                    }
                    ilast = i ;
                }
            }
            tnvec_slice [tid] = my_tnvec ;
            tnz_slice   [tid] = (tend - tstart) - my_ndupl ;
        }
    }

    #undef  GB_K_WORK
    #define GB_K_WORK(t) (K_work ? GB_IGET (K_work, t) : (t))

    //--------------------------------------------------------------------------
    // find total # of vectors and duplicates in all tuples
    //--------------------------------------------------------------------------

    // Replace tnvec_slice with its cumulative sum, after which each slice tid
    // will be responsible for the # vectors in T that range from tnvec_slice
    // [tid] to tnvec_slice [tid+1]-1.
    GB_cumsum1_64 ((uint64_t *) tnvec_slice, nthreads) ;
    int64_t tnvec = tnvec_slice [nthreads] ;

    // Replace tnz_slice with its cumulative sum
    GB_cumsum1_64 ((uint64_t *) tnz_slice, nthreads) ;

    // find the total # of final entries, after assembling duplicates
    int64_t tnz = tnz_slice [nthreads] ;
    int64_t ndupl = nvals - tnz ;

    Tp_is_32 = GB_determine_p_is_32 (Tp_is_32, tnz) ;        // OK

    //--------------------------------------------------------------------------
    // allocate T; always hypersparse
    //--------------------------------------------------------------------------

    // allocate T; allocate T->p and T->h but do not initialize them.
    // T is always hypersparse.  The header T always exists on input, as
    // either a static or dynamic header.
    GB_OK (GB_new (&T, // always hyper, existing header
        ttype, vlen, vdim, GB_ph_malloc, is_csc,
        GxB_HYPERSPARSE, GB_ALWAYS_HYPER, tnvec,
        Tp_is_32, Tj_is_32, Ti_is_32)) ;

    ASSERT (T->p != NULL) ;
    ASSERT (T->h != NULL) ;
    ASSERT (T->b == NULL) ;
    ASSERT (T->i == NULL) ;
    ASSERT (T->x == NULL) ;

    T->iso = S_iso ;    // T is iso if and only if Sx is iso
    do_burble = do_burble && (vlen > 1 || vdim > 1) && (nvals > 1) ;
    if (do_burble)
    {
        if (S_iso)
        { 
            GBURBLE ("(iso build, %d threads) ", nthreads) ;
        }
        else
        { 
            GBURBLE ("(build, %d threads) ", nthreads) ;
        }
    }

    //--------------------------------------------------------------------------
    // STEP 4: construct the vector pointers and hyperlist for T
    //--------------------------------------------------------------------------

    // Step 4 scans the J_work indices and constructs T->h and T->p.

    void *Tp = T->p ;
    void *Th = T->h ;
    GB_IDECL (Tp, , u) ; GB_IPTR (Tp, Tp_is_32) ;
    GB_IDECL (Th, , u) ; GB_IPTR (Th, Tj_is_32) ;

    if (vdim <= 1)
    {

        //----------------------------------------------------------------------
        // special case for vectors
        //----------------------------------------------------------------------

        ASSERT (tnvec == 0 || tnvec == 1) ;
        if (tnvec > 0)
        { 
            GB_ISET (Tp, 0, 0) ;    // Tp [0] = 0
            GB_ISET (Th, 0, 0) ;    // Th [0] = 0 ;
        }

    }
    else if (ndupl == 0)
    {

        //----------------------------------------------------------------------
        // no duplicates appear
        //----------------------------------------------------------------------

        int tid ;
        #pragma omp parallel for num_threads(nthreads) schedule(static)
        for (tid = 0 ; tid < nthreads ; tid++)
        {
            int64_t my_tnvec = tnvec_slice [tid] ;
            int64_t tstart   = tstart_slice [tid] ;
            int64_t tend     = tstart_slice [tid+1] ;
            int64_t jlast    = GB_J_WORK (tstart-1) ;
            for (int64_t t = tstart ; t < tend ; t++)
            {
                // get the t-th tuple
                int64_t j = GB_J_WORK (t) ;
                if (j > jlast)
                { 
                    // vector j starts in this slice
                    GB_ISET (Th, my_tnvec, j) ;     // Th [my_tnvec] = j ;
                    GB_ISET (Tp, my_tnvec, t) ;     // Tp [my_tnvec] = t ;
                    my_tnvec++ ;
                    jlast = j ;
                }
            }
        }

    }
    else
    {

        //----------------------------------------------------------------------
        // it is known that at least one duplicate appears
        //----------------------------------------------------------------------

        int tid ;
        #pragma omp parallel for num_threads(nthreads) schedule(static)
        for (tid = 0 ; tid < nthreads ; tid++)
        {
            int64_t my_tnz   = tnz_slice [tid] ;
            int64_t my_tnvec = tnvec_slice [tid] ;
            int64_t tstart   = tstart_slice [tid] ;
            int64_t tend     = tstart_slice [tid+1] ;
            int64_t jlast    = GB_J_WORK (tstart-1) ;
            for (int64_t t = tstart ; t < tend ; t++)
            {
                // get the t-th tuple
                int64_t i = GB_IGET (I_work, t) ;
                int64_t j = GB_J_WORK (t) ;
                // a duplicate index i is set to duplicate_entry
                if (i != duplicate_entry)
                {
                    // this is a new tuple; not a duplicate
                    if (j > jlast)
                    { 
                        // vector j starts in this slice 
                        GB_ISET (Th, my_tnvec, j) ;     // Th [my_tnvec]=j
                        GB_ISET (Tp, my_tnvec, my_tnz) ;// Tp [my_tnvec]=my_tnz
                        my_tnvec++ ;
                        jlast = j ;
                    }
                    my_tnz++ ;
                }
            }
        }
    }

    // log the end of the last vector
//  T->nvec_nonempty = tnvec ;
    GB_nvec_nonempty_set (T, tnvec) ;
    T->nvec = tnvec ;
    GB_ISET (Tp, tnvec, tnz) ;  // Tp [tnvec] = tnz ;
    T->nvals = tnz ;
    ASSERT (T->nvec == T->plen || (T->plen == 1 && T->nvec == 0)) ;
    T->magic = GB_MAGIC ;

    //--------------------------------------------------------------------------
    // free J_work if it has been allocated
    //--------------------------------------------------------------------------

    // J_work might be aliased to J_input, and in this case, J_work_handle
    // is NULL.  J_input is not modified.
    ASSERT (J_work_handle != NULL) ;
    GB_FREE_MEMORY (J_work_handle, *J_work_size_handle) ;
    J_work = NULL ;
    J_work32 = NULL ;
    J_work64 = NULL ;

    //--------------------------------------------------------------------------
    // allocate T->i
    //--------------------------------------------------------------------------

    if (ndupl == 0)
    {

        //----------------------------------------------------------------------
        // no duplicates: copy/transplant I_work into T->i
        //----------------------------------------------------------------------

        if (Ti_is_32 == I_is_32)
        { 
            // shrink I_work from size ijslen to size tnz
            if (tnz < ijslen)
            { 
                // this cannot fail since the size is shrinking.
                bool ok ;
                GB_REALLOC_MEMORY (I_work, tnz,
                    I_is_32 ? sizeof (uint32_t) : sizeof (uint64_t),
                    I_work_size_handle, &ok) ;
                ASSERT (ok) ;
            }
            // transplant I_work into T->i
            T->i = I_work ; T->i_size = (*I_work_size_handle) ;
        }
        else
        { 
            // T->i and I_work have different sizes, so they must be typecast
            T->i = GB_MALLOC_MEMORY (tnz,
                Ti_is_32 ? sizeof (int32_t) : sizeof (int64_t), &(T->i_size)) ;
            if (T->i != NULL)
            { 
                // T->i = cast (I_work), but use uint32/64 for T->i since
                // I_work has no zombies.
                if (do_burble)
                { 
                    GBURBLE ("(cast Ti %d %d) ", Ti_is_32, I_is_32) ;
                }
                GB_cast_int (
                    T->i,  Ti_is_32 ? GB_UINT32_code : GB_UINT64_code,
                    I_work, I_is_32 ? GB_UINT32_code : GB_UINT64_code,
                    tnz, nthreads) ;
            }
            // free I_work
            GB_FREE_MEMORY (I_work_handle, *I_work_size_handle) ;
        }

        // I_work has been free or transplanted into T->i
        I_work = NULL ;
        I_work32 = NULL ;
        I_work64 = NULL ;
        (*I_work_handle) = NULL ;
        (*I_work_size_handle) = 0 ;
    }
    else
    { 

        //----------------------------------------------------------------------
        // duplicates exist: T->i is computed from I_work in step 5
        //----------------------------------------------------------------------

        // duplicates exist, so allocate a new T->i.  I_work must be freed later
        T->i = GB_MALLOC_MEMORY (tnz,
            Ti_is_32 ? sizeof (int32_t) : sizeof (int64_t), &(T->i_size)) ;
    }

    if (T->i == NULL)
    { 
        // out of memory
        GB_phybix_free (T) ;
        GB_FREE_WORKSPACE ;
        return (GrB_OUT_OF_MEMORY) ;
    }

    T->i_is_32 = Ti_is_32 ;
    void *restrict Ti = T->i ;
    GB_IDECL (Ti, , ) ; GB_IPTR (Ti, Ti_is_32) ;

    //==========================================================================
    // numerical phase of the build: assemble any duplicates
    //==========================================================================

    // The tuples have been sorted.  Assemble any duplicates with a switch
    // factory of built-in workers, or four generic workers.  The vector
    // pointers T->p and hyperlist T->h (if hypersparse) have already been
    // computed.

    // If there are no duplicates, T->i holds the row indices of the tuple.
    // Otherwise, the row indices are still in I_work.  K_work holds the
    // positions of each tuple in the array Sx.  The tuples are sorted so that
    // duplicates are adjacent to each other and they appear in the order they
    // appeared in the original tuples.  This method assembles the duplicates
    // and computes T->i and T->x from I_work, K_work, and Sx.  into T, becoming
    // T->i.  If no duplicates appear, T->i is already computed, and Sx just
    // needs to be copied and permuted into T->x.

    // The (i,k,Sx[k]) tuples are held in two integer arrays: (1) I_work or
    // T->i, and (2) K_work, and an array Sx of numerical values.  Sx has not
    // been sorted, nor even accessed yet.  It is identical to the original
    // unsorted tuples.  The (i,k,Sx[k]) tuple holds the row index i, the
    // position k, and the value Sx [k].  This entry becomes T(i,j) = Sx [k] in
    // the matrix T, and duplicates (if any) are assembled via the dup
    // operator.

    // With GB_build, there can be 1 to 2 different types.
    //      T->type is identical to the types of x,y,z for z=dup(x,y).
    //      dup is never NULL and all its three types are the same
    //      The type of Sx (stype) can be different but must be compatible
    //          with T->type

    // With GB_wait, there can be 1 to 5 different types:
    //      The pending tuples are in Sx, of type stype which must be
    //          compatible with dup->ytype and T->type
    //      z = dup (x,y): can be NULL or have 1 to 3 different types
    //      T->type: must be compatible with all above types.
    //      dup may be NULL, in which case it is assumed be the implicit SECOND
    //          operator, with all three types equal to T->type

    //--------------------------------------------------------------------------
    // construct the SECOND operator if dup is NULL
    //--------------------------------------------------------------------------

    struct GB_BinaryOp_opaque dup_header ;
    if (dup == NULL)
    { 
        // z = SECOND (x,y) where all three types are the same as ttype
        // T(i,j) = (ttype) Sx(k) will be done for all tuples.  If dup is
        // SECOND_UDT, dup->binop_function will be NULL; this is OK.
        dup = GB_binop_second (ttype, &dup_header) ;
        ASSERT (dup != NULL && GB_op_is_second (dup, ttype)) ;
    }

    //--------------------------------------------------------------------------
    // get the dup operator
    //--------------------------------------------------------------------------

    // T(i,j) = (ttype) Sx[k] will be done for the first tuple.  for subsequent
    // tuples: T(i,j) += Sx[k], via the dup operator and typecasting:
    //
    //      y = (dup->ytype) Sx[k]
    //      x = (dup->xtype) T(i,j)
    //      z = (dup->ztype) dup (x,y)      // or z = y if fdup is NULL
    //      T(i,j) = (ttype) z

    ASSERT_BINARYOP_OK (dup, "dup for build_factory", GB0) ;
    GrB_Type xtype = dup->xtype ;
    GrB_Type ytype = dup->ytype ;
    GrB_Type ztype = dup->ztype ;
    GxB_binary_function fdup = dup->binop_function ;
    bool op_is_2nd = GB_op_is_second (dup, ttype) ;

    //--------------------------------------------------------------------------
    // get the sizes and codes of each type
    //--------------------------------------------------------------------------

    GB_Type_code tcode = ttype->code ;
    const size_t tsize = ttype->size ;

    GB_Type_code zcode = ztype->code ;
    GB_Type_code xcode = xtype->code ;
    GB_Type_code ycode = ytype->code ;

    ASSERT (GB_Type_compatible (ttype, stype)) ;    // T(i,j) = (ttype) Sx
    ASSERT (GB_Type_compatible (ytype, stype)) ;    // y = (ytype) Sx
    ASSERT (GB_Type_compatible (xtype, ttype)) ;    // x = (xtype) T(i,j)
    ASSERT (GB_Type_compatible (ttype, ztype)) ;    // T(i,j) = (ttype) z

    size_t zsize = ztype->size ;
    size_t xsize = xtype->size ;
    size_t ysize = ytype->size ;

    // no typecasting if all 5 types are the same
    bool nocasting = (ttype == stype) &&
        (ttype == xtype) && (stype == ytype) && (ttype == ztype) ;

    ASSERT_TYPE_OK (ttype, "ttype for build_factory", GB0) ;
    ASSERT_TYPE_OK (stype, "stype for build_factory", GB0) ;
    ASSERT_TYPE_OK (xtype, "xtype for build_factory", GB0) ;
    ASSERT_TYPE_OK (ytype, "ytype for build_factory", GB0) ;
    ASSERT_TYPE_OK (ztype, "ztype for build_factory", GB0) ;

    //--------------------------------------------------------------------------
    // STEP 5: assemble the tuples
    //--------------------------------------------------------------------------

    bool copy_S_into_T = (nocasting && known_sorted && ndupl == 0) ;
    info = GrB_NO_VALUE ;

    if (copy_S_into_T && S_work != NULL)
    { 

        //----------------------------------------------------------------------
        // transplant S_work into T->x
        //----------------------------------------------------------------------

        // No typecasting is needed, the tuples were originally in sorted
        // order, and no duplicates appear.  All that is required is to copy Sx
        // into Tx.  Sx can be directly transplanted into T->x since Sx is
        // provided as S_work.  GB_builder must either transplant or free
        // S_work.  The transplant can be used by GB_wait, whenever the tuples
        // are already sorted, with no duplicates, and no typecasting is
        // needed, since S_work is always A->Pending->x.  T and Sx may be iso
        // or non-iso.

        T->x = S_work ; T->x_size = (*S_work_size_handle) ;
        S_work = NULL ;
        (*S_work_handle) = NULL ;
        (*S_work_size_handle) = 0 ;

        int64_t tx_size_required = tnz * tsize ;
        if (2 * tx_size_required < T->x_size)
        { 
            // shrink the size of T->x
            bool ok = true ;
            GB_REALLOC_MEMORY (T->x, tnz, tsize, &(T->x_size), &ok) ;
        }
        info = GrB_SUCCESS ;

    }
    else
    {

        //----------------------------------------------------------------------
        // allocate T->x
        //----------------------------------------------------------------------

        T->x = GB_XALLOC_MEMORY (false, S_iso, tnz, tsize, &(T->x_size)) ;
        if (T->x == NULL)
        { 
            // out of memory
            GB_phybix_free (T) ;
            GB_FREE_WORKSPACE ;
            return (GrB_OUT_OF_MEMORY) ;
        }

        GB_void *restrict Tx = (GB_void *) T->x ;

        ASSERT (GB_IMPLIES (nvals > 0, Sx != NULL)) ;

        if (nvals == 0)
        { 

            // nothing to do
            info = GrB_SUCCESS ;

        }
        else if (copy_S_into_T)
        { 

            //------------------------------------------------------------------
            // copy Sx into T->x
            //------------------------------------------------------------------

            // No typecasting is needed, the tuples were originally in sorted
            // order, and no duplicates appear.  All that is required is to
            // copy Sx into Tx.  Sx cannot be transplanted into T->x since
            // S_work is NULL and S_input cannot be modified by GB_builder.

            ASSERT (S_work == NULL) ;
            ASSERT (Sx == S_input) ;
            GB_memcpy (Tx, Sx, (S_iso ? 1 : nvals) * tsize, nthreads) ;
            info = GrB_SUCCESS ;

        }
        else if (nocasting)
        { 

            //------------------------------------------------------------------
            // assemble the values, Sx, into T, no typecasting needed
            //------------------------------------------------------------------

            // Sx (either S_work or S_input) must be permuted and copied into
            // T->x, since the tuples had to be sorted, or duplicates appear.
            // Any duplicates are now assembled.

            // There are 44 common cases of this function for built-in types
            // and 8 associative operators: MIN, MAX, PLUS, TIMES for 10 types
            // (all but boolean; and OR, AND, XOR, and EQ for boolean.

            // In addition, the FIRST and SECOND operators are hard-coded, for
            // another 22 workers, since SECOND is used by GB_wait and since
            // FIRST is useful for keeping the first tuple seen.  It is
            // controlled by the GB_INCLUDE_SECOND_OPERATOR definition, so they
            // do not appear in GB_reduce_to_* where the FIRST and SECOND
            // operators are not needed.

            // Early exit cannot be exploited, so the terminal is ignored.

            if (S_iso)
            { 

                //--------------------------------------------------------------
                // via the iso kernel
                //--------------------------------------------------------------

                // T and Sx are iso; set iso value and delete duplicates
                memcpy (Tx, Sx, tsize) ;
                #define GB_ISO_BUILD
                #include "builder/template/GB_bld_template.c"
                info = GrB_SUCCESS ;

            }
            else
            { 

                //--------------------------------------------------------------
                // via the factory kernel
                //--------------------------------------------------------------

                // T and Sx are not iso; call in the workers

                #ifndef GBCOMPACT
                GB_IF_FACTORY_KERNELS_ENABLED
                { 

                    //----------------------------------------------------------
                    // define the worker for the switch factory
                    //----------------------------------------------------------

                    #define GB_bld(opname,aname) \
                        GB (_bld_ ## opname ## aname)

                    #define GB_BLD_WORKER(opname,aname,st_type)              \
                    {                                                        \
                        info = GB_bld (opname, aname) ((st_type *) Tx, Ti,   \
                            Ti_is_32, (st_type *) Sx, nvals, ndupl,          \
                            I_work, I_is_32, K_work, K_is_32,                \
                            duplicate_entry,                                 \
                            tstart_slice, tnz_slice, nthreads) ;             \
                    }                                                        \
                    break ;

                    //----------------------------------------------------------
                    // launch the switch factory
                    //----------------------------------------------------------

                    // controlled by opcode and tcode
                    GB_Opcode opcode = dup->opcode ;
                    if (tcode == GB_BOOL_code)
                    { 
                        opcode = GB_boolean_rename (opcode) ;
                    }
                    #include "builder/factory/GB_bld_factory.c"
                }
                #endif
            }

            //------------------------------------------------------------------
            // via the JIT or PreJIT kernel
            //------------------------------------------------------------------

            if (info == GrB_NO_VALUE)
            { 
                info = GB_build_jit (Tx, T->i, Ti_is_32, Sx, ttype, stype, dup,
                    nvals, ndupl, I_work, I_is_32, K_work, K_is_32,
                    K_work == NULL, duplicate_entry, tstart_slice, tnz_slice,
                    nthreads) ;
            }

            //------------------------------------------------------------------
            // via the generic kernel
            //------------------------------------------------------------------

            if (info == GrB_NO_VALUE)
            {
                if (do_burble) GBURBLE ("(generic build) ") ;

                //--------------------------------------------------------------
                // no typecasting, but use the fdup function pointer and memcpy
                //--------------------------------------------------------------

                // Either the fdup operator or type of Sx and T are
                // user-defined, or fdup is not handled by the GB_bld_factory,
                // or some combination of these conditions.  User-defined types
                // cannot be typecasted, so this handles all user-defined
                // types.

                // Tx [p] = (ttype) Sx [k], but with no typecasting
                #undef  GB_BLD_COPY
                #define GB_BLD_COPY(Tx,p,Sx,k)               \
                    memcpy (Tx +((p)*tsize), Sx +((k)*tsize), tsize) ;

                if (op_is_2nd)
                { 

                    //----------------------------------------------------------
                    // dup is the SECOND operator, with no typecasting
                    //----------------------------------------------------------

                    // Tx [p] += (ttype) Sx [k], but 2nd op and no typecasting
                    #undef  GB_BLD_DUP
                    #define GB_BLD_DUP(Tx,p,Sx,k) GB_BLD_COPY(Tx,p,Sx,k)
                    #include "builder/template/GB_bld_template.c"

                }
                else
                { 

                    //----------------------------------------------------------
                    // dup is another operator, with no typecasting needed
                    //----------------------------------------------------------

                    // Tx [p] += (ttype) Sx [k], but with no typecasting
                    #undef  GB_BLD_DUP
                    #define GB_BLD_DUP(Tx,p,Sx,k)       \
                        fdup (Tx +((p)*tsize), Tx +((p)*tsize), Sx+((k)*tsize));
                    #include "builder/template/GB_bld_template.c"
                }

                info = GrB_SUCCESS ;
            }

        }
        else
        {

            //------------------------------------------------------------------
            // assemble the values Sx into T, typecasting as needed
            //------------------------------------------------------------------

            //------------------------------------------------------------------
            // via the JIT or PreJIT kernel
            //------------------------------------------------------------------

            if (info == GrB_NO_VALUE)
            { 
                info = GB_build_jit (Tx, T->i, Ti_is_32, Sx, ttype, stype, dup,
                    nvals, ndupl, I_work, I_is_32, K_work, K_is_32,
                    K_work == NULL, duplicate_entry, tstart_slice, tnz_slice,
                    nthreads) ;
            }

            //------------------------------------------------------------------
            // via the generic kernel
            //------------------------------------------------------------------

            if (info == GrB_NO_VALUE)
            {

                if (do_burble)
                {
                    GBURBLE ("(generic build with typecast) ") ;
                }

                // If T and Sx are iso, no typecasting is ever done, so this
                // method is not used in that case.
                ASSERT (!S_iso) ;

                // Sx (either S_work or S_input) must be permuted and copied
                // into T->x, since the tuples had to be sorted, or duplicates
                // appear.  Any duplicates are now assembled.  Not all of the 5
                // types are the same, but all of them are built-in since
                // user-defined types cannot be typecasted.

                const GB_Type_code Scode = stype->code ;
                const size_t ssize = stype->size ;
                GB_cast_function cast_S_to_T = GB_cast_factory (tcode, Scode) ;
                GB_cast_function cast_S_to_Y = GB_cast_factory (ycode, Scode) ;
                GB_cast_function cast_T_to_X = GB_cast_factory (xcode, tcode) ;
                GB_cast_function cast_Z_to_T = GB_cast_factory (tcode, zcode) ;

                // all types must be built-in
                ASSERT (Scode <= GB_FC64_code) ;
                ASSERT (tcode <= GB_FC64_code) ;
                ASSERT (xcode <= GB_FC64_code) ;
                ASSERT (ycode <= GB_FC64_code) ;
                ASSERT (zcode <= GB_FC64_code) ;

                // Tx [p] = (ttype) Sx [k], with typecasting
                #undef  GB_BLD_COPY
                #define GB_BLD_COPY(Tx,p,Sx,k)                   \
                    cast_S_to_T (Tx +((p)*tsize), Sx +((k)*ssize), ssize) ;

                if (op_is_2nd)
                { 

                    //----------------------------------------------------------
                    // dup operator is the SECOND operator, with typecasting
                    //----------------------------------------------------------

                    // Tx [p] += (ttype) Sx [k], but 2nd op, with typecasting
                    #undef  GB_BLD_DUP
                    #define GB_BLD_DUP(Tx,p,Sx,k) GB_BLD_COPY(Tx,p,Sx,k)
                    #include "builder/template/GB_bld_template.c"

                }
                else
                { 

                    //----------------------------------------------------------
                    // dup is another operator, with typecasting required
                    //----------------------------------------------------------

                    // Tx [p] += Sx [k], with typecasting
                    #undef  GB_BLD_DUP
                    #define GB_BLD_DUP(Tx,p,Sx,k)                       \
                    {                                                   \
                        /* ywork = (ytype) Sx [k] */                    \
                        GB_void ywork [GB_VLA(ysize)] ;                 \
                        cast_S_to_Y (ywork, Sx +((k)*ssize), ssize) ;   \
                        /* xwork = (xtype) Tx [p] */                    \
                        GB_void xwork [GB_VLA(xsize)] ;                 \
                        cast_T_to_X (xwork, Tx +((p)*tsize), tsize) ;   \
                        /* zwork = f (xwork, ywork) */                  \
                        GB_void zwork [GB_VLA(zsize)] ;                 \
                        fdup (zwork, xwork, ywork) ;                    \
                        /* Tx [tnz-1] = (ttype) zwork */                \
                        cast_Z_to_T (Tx +((p)*tsize), zwork, zsize) ;   \
                    }

                    #include "builder/template/GB_bld_template.c"
                }
            }

            info = GrB_SUCCESS ;
        }
    }

    //--------------------------------------------------------------------------
    // free workspace and return result
    //--------------------------------------------------------------------------

    GB_FREE_WORKSPACE ;
    if (info == GrB_SUCCESS)
    { 
        T->jumbled = false ;
        ASSERT_MATRIX_OK (T, "T built", GB0) ;
        ASSERT (GB_IS_HYPERSPARSE (T)) ;
    }
    return (info) ;
}

